This analysis locally maps breakpoint density across the genome by binning breakpoint SV and CNV data. This notebook returns data that are passed on to 02-a-plot-chr-instability-heatmaps.Rmd and 02b-plot-chr-instability-by-histology.Rmd for plotting.
The breaks_density function bins the genome and counts how many chromosome breaks occur for each bin, given the given bin_size.
We will calculate breakpoint density across bins of the genome for each sample and histology group for all three breakpoint datasets set up by 00-setup-breakpoint_data.R.
intersection_of_breaks contains the intersection break counts for both SV and CNV break data.cnv_breaks contains the number of break counts for CNV.sv_breaks contains the number of break counts for SV.This notebook can be run via the command line from the top directory of the repository as follows:
Rscript -e "rmarkdown::render('analyses/chromosomal-instability/01-localization-of-breakpoints.Rmd',
clean = TRUE)"
# Change here and it will change the rest
bin_size <- 1e6
# Set seed so heatmaps turn out the same
set.seed(2020)
# Magrittr pipe
`%>%` <- dplyr::`%>%`
Read in the custom functions needed for this analysis.
source(file.path("util", "chr-break-calculate.R"))
source(file.path("util", "chr-break-plot.R"))
# Path to data directory
data_dir <- file.path("..", "..", "data")
scratch_dir <- file.path("..", "..", "scratch")
# The output from this notebook is used for downstream notebooks so we will save
# it in a special directory
output_dir <- "breakpoint-data"
# Create the output_dir if it does not exist
if (!dir.exists(output_dir)) {
dir.create(output_dir)
}
Here’s all the input files we will need:
chr_file <- file.path(data_dir, "WGS.hg38.strelka2.unpadded.bed")
metadata_file <- file.path(data_dir, "pbta-histologies.tsv")
breaks_file <- file.path(output_dir, "breaks_lists.RDS")
uncallable_file <- file.path(
"..", "copy_number_consensus_call", "ref",
"cnv_excluded_regions.bed"
)
The histology output file:
output_file <- file.path(output_dir, "histology_breakpoint_binned_counts.RDS")
Wrapper function to run break_density for all three datasets: CNV, SV, and intersection data.
all_break_density <- function(breaks_list, sample_ids, return_vector) {
# For given sample(s) ids, run breaks_density for all dataframes items in breaks_list.
#
# Args:
# breaks_list: a data.frame with chromosomal breaks.
# sample_ids: a single or multiple samples that correspond to the sample_col
# argument and will be used for calculating density.
# return_vector: If we don't need the full GenomicRanges object, only the total_counts, put TRUE
#
# Returns:
# GenomicRanges object that contains the densities mapped across the genome
# for the given samples and for all data.frames in breaks_list.
lapply(breaks_list, break_density, # Either intersection, CNV, SV data depending on the iteration.
sample_id = sample_ids,
start_col = "coord",
end_col = "coord",
window_size = bin_size, # Bin size to calculate breaks density
chr_sizes_vector = chr_sizes_vector, # This is the sizes of chromosomes used for binning
unsurveyed_bed = uncallable_bed, # This is the BED file that notes what regions are uncallable
perc_cutoff = .75, # What percentage of each bin needs to be callable for it not to be NA
return_vector = return_vector # If we don't need the full GenomicRanges object
)
}
Import data that was set up in breakpoint format in 00-setup-breakpoint-data.R.
breaks_list <- readr::read_rds(breaks_file)
Get the list of biospecimen IDs that are in this set.
common_samples <- unique(breaks_list$intersection_of_breaks$samples)
Set up metadata
# Read in the metadata
metadata <- readr::read_tsv(metadata_file) %>%
# Only keep metadata for the samples we are working with here
dplyr::filter(Kids_First_Biospecimen_ID %in% common_samples)
Parsed with column specification:
cols(
.default = col_character(),
OS_days = [32mcol_double()[39m,
age_last_update_days = [32mcol_double()[39m,
normal_fraction = [32mcol_double()[39m,
tumor_fraction = [32mcol_double()[39m,
tumor_ploidy = [32mcol_double()[39m,
molecular_subtype = [33mcol_logical()[39m
)
See spec(...) for full column specifications.
493 parsing failures.
row col expected actual file
2334 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2335 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv'
2336 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2337 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2338 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
.... ................. .................. ...... .................................
See problems(...) for more details.
Set up chromosome size data that will be used for creating genome bins. It just so happens that this BED file: WGS.hg38.strelka2.unpadded.bed is actually just a list of the chromosome sizes so we are using that for now.
# Set up Chr sizes
chr_sizes_bed <- readr::read_tsv(chr_file,
col_names = c("chrom", "start", "end")
) %>%
# Reformat the chromosome variable to drop the "chr"
dplyr::mutate(chrom = factor(gsub("chr", "", chrom),
levels = c(1:22, "X", "Y", "M")
)) %>%
# Remove sex chromosomes
dplyr::filter(!(chrom %in% c("X", "Y", "M")))
Parsed with column specification:
cols(
chrom = [31mcol_character()[39m,
start = [32mcol_double()[39m,
end = [32mcol_double()[39m
)
# Make chromosome size named vector for Heatmap annotation
chr_sizes_vector <- chr_sizes_bed$end
names(chr_sizes_vector) <- chr_sizes_bed$chrom
Set up uncallable regions BED. We will use this to declare NAs in regions with too high of a percentage of uncallable regions.
uncallable_bed <- readr::read_tsv(uncallable_file,
col_names = c("chrom", "start", "end")
) %>%
# Reformat the chromosome variable to drop the "chr"
dplyr::mutate(chrom = factor(gsub("chr", "", chrom),
levels = c(1:22, "X", "Y")
)) %>%
dplyr::filter(
!is.na(chrom),
# Drop sex chromosomes
!(chrom %in% c("X", "Y", "M"))
)
Parsed with column specification:
cols(
chrom = [31mcol_character()[39m,
start = [32mcol_double()[39m,
end = [32mcol_double()[39m
)
Let’s reformat that the intersection_or_breaks data.frame. It contains both SV data and CNV data for its intersections. We’ll go by SV data here.
# For simplicity's sake, change the name of the intersection column we want to use
breaks_list$intersection_of_breaks <- breaks_list$intersection_of_breaks %>%
dplyr::rename(
coord = sv_ranges.start
)
Calculate sample breakpoint densities across the binned genome.
# Get a big list of break densities for each sample.
sample_densities <- lapply(common_samples, function(sample_id) {
all_break_density(
breaks_list = breaks_list,
sample_id = sample_id,
return_vector = TRUE
) # We only want the total counts,
# not the whole GenomicRanges object
})
# Bring along the sample IDs
names(sample_densities) <- common_samples
Switch list to be by breaks first, samples second.
sample_densities <- purrr::transpose(sample_densities)
# Extract the chromosome names for each bin
chr_bin_names <- names(sample_densities[[1]][[1]])
Turn each dataset into its own data.frame and write it to its own TSV file. This will be used by 02-a-plot-chr-instability-heatmaps.Rmd for heatmaps.
# Write the break densities each as their own files
purrr::imap(sample_densities, function(.x, name = .y) {
dplyr::bind_rows(.x) %>%
# Add on the chromosome names so they don't get lost
dplyr::mutate(chr_bin_names) %>%
# Write each to a TSV file
readr::write_tsv(file.path(
output_dir,
paste0(name, "_binned_counts.tsv")
))
})
$intersection_of_breaks
$cnv_breaks
$sv_breaks
NA
Same as was done for each sample, now we will calculate densities for each tumor-type group. This will be used by 02-b-plot-chr-instability-by-histology.Rmd for plots.
# Get a list of the tumor_types for which we have DNA-seq data
tumor_types <- metadata %>%
dplyr::filter(!is.na(short_histology), experimental_strategy != "RNA-Seq") %>%
dplyr::distinct(short_histology) %>%
dplyr::pull(short_histology)
Run the density calculations for the groups.
# Get a big list of break densities for each sample.
group_densities <- lapply(tumor_types, function(tumor_type) {
# Obtain a list of sample_ids to calculate break density for
sample_ids <- metadata %>%
dplyr::filter(metadata$short_histology == tumor_type) %>%
dplyr::pull(Kids_First_Biospecimen_ID)
# Print progress message
message(paste("Calculating breakpoint density for", tumor_type, "samples"))
# Calculate break density for all 3 datasets
all_break_density(
breaks_list = breaks_list,
sample_id = sample_ids, # We are supplying all samples for this tumor-type
return_vector = FALSE
) # We want the whole GenomicRanges object
})
Calculating breakpoint density for Ependymoma samples
Calculating breakpoint density for LGAT samples
Calculating breakpoint density for ATRT samples
Calculating breakpoint density for Choroid plexus tumor samples
Calculating breakpoint density for Meningioma samples
Calculating breakpoint density for Other samples
Calculating breakpoint density for CNS neuroblastoma samples
Calculating breakpoint density for Ganglioglioma samples
Calculating breakpoint density for Neurofibroma samples
Calculating breakpoint density for Chordoma samples
Calculating breakpoint density for DNET samples
Calculating breakpoint density for MPNST samples
Calculating breakpoint density for HGAT samples
Calculating breakpoint density for Pineoblastoma samples
Calculating breakpoint density for Teratoma samples
Calculating breakpoint density for Craniopharyngioma samples
Calculating breakpoint density for Germinoma samples
Calculating breakpoint density for CNS Rhabdomyosarcoma samples
Calculating breakpoint density for Schwannoma samples
Calculating breakpoint density for Oligodendroglioma samples
Calculating breakpoint density for Langerhans Cell histiocytosis samples
Calculating breakpoint density for Dysplasia samples
Calculating breakpoint density for Central neurocytoma samples
Calculating breakpoint density for CNS sarcoma samples
Calculating breakpoint density for CNS EFT-CIC samples
Calculating breakpoint density for Medulloblastoma samples
Calculating breakpoint density for ETMR samples
Calculating breakpoint density for Embryonal Tumor samples
# Bring along the tumor-type labels
names(group_densities) <- tumor_types
Save list of GenomicRanges objects to an RDS file.
# Save to an RDS file
readr::write_rds(
group_densities,
output_file
)
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] ProtGenerics_1.18.0 bitops_1.0-6
[3] matrixStats_0.55.0 bit64_0.9-7
[5] RColorBrewer_1.1-2 progress_1.2.2
[7] httr_1.4.0 GenomeInfoDb_1.22.0
[9] tools_3.6.0 backports_1.1.4
[11] R6_2.4.0 rpart_4.1-15
[13] Hmisc_4.2-0 DBI_1.0.0
[15] lazyeval_0.2.2 BiocGenerics_0.32.0
[17] colorspace_1.4-1 nnet_7.3-12
[19] tidyselect_0.2.5 gridExtra_2.3
[21] prettyunits_1.0.2 GGally_1.4.0
[23] bit_1.1-14 curl_3.3
[25] compiler_3.6.0 graph_1.62.0
[27] Biobase_2.46.0 htmlTable_1.13.1
[29] DelayedArray_0.12.2 labeling_0.3
[31] rtracklayer_1.46.0 ggbio_1.34.0
[33] scales_1.0.0 checkmate_1.9.4
[35] readr_1.3.1 RBGL_1.62.1
[37] askpass_1.1 rappdirs_0.3.1
[39] stringr_1.4.0 digest_0.6.20
[41] Rsamtools_2.2.1 foreign_0.8-71
[43] rmarkdown_1.13 XVector_0.26.0
[45] base64enc_0.1-3 dichromat_2.0-0
[47] pkgconfig_2.0.2 htmltools_0.3.6
[49] ensembldb_2.10.2 dbplyr_1.4.2
[51] BSgenome_1.54.0 htmlwidgets_1.3
[53] rlang_0.4.0 rstudioapi_0.10
[55] RSQLite_2.1.1 jsonlite_1.6
[57] BiocParallel_1.20.1 acepack_1.4.1
[59] dplyr_0.8.3 VariantAnnotation_1.32.0
[61] RCurl_1.95-4.12 magrittr_1.5
[63] GenomeInfoDbData_1.2.2 Formula_1.2-3
[65] Matrix_1.2-17 Rcpp_1.0.1
[67] munsell_0.5.0 S4Vectors_0.24.1
[69] stringi_1.4.3 yaml_2.2.0
[71] SummarizedExperiment_1.16.1 zlibbioc_1.32.0
[73] plyr_1.8.4 BiocFileCache_1.10.2
[75] grid_3.6.0 blob_1.1.1
[77] parallel_3.6.0 crayon_1.3.4
[79] lattice_0.20-38 cowplot_0.9.4
[81] Biostrings_2.54.0 splines_3.6.0
[83] GenomicFeatures_1.38.0 hms_0.4.2
[85] knitr_1.23 pillar_1.4.2
[87] GenomicRanges_1.38.0 reshape2_1.4.3
[89] biomaRt_2.42.0 stats4_3.6.0
[91] XML_3.98-1.20 glue_1.3.1
[93] evaluate_0.14 biovizBase_1.34.1
[95] latticeExtra_0.6-28 BiocManager_1.30.4
[97] data.table_1.12.2 tidyr_0.8.3
[99] gtable_0.3.0 openssl_1.4
[101] purrr_0.3.2 reshape_0.8.8
[103] assertthat_0.2.1 ggplot2_3.2.0
[105] xfun_0.8 AnnotationFilter_1.10.0
[107] survival_2.44-1.1 OrganismDbi_1.28.0
[109] tibble_2.1.3 GenomicAlignments_1.22.1
[111] AnnotationDbi_1.48.0 memoise_1.1.0
[113] IRanges_2.20.1 cluster_2.1.0